Cargamos el path
import os

# path.txt must contain exactly two lines; they are unpacked into datapath and
# repopath. The context manager closes the file as soon as the lines are read.
with open("path.txt", mode="r") as myfile:
    datapath, repopath = myfile.readlines()
# Only datapath has its newline removed; repopath is left exactly as read.
datapath = datapath.strip("\n")
# File names of the two CSV inputs, resolved against datapath when loaded.
acc = 'accidents_labeled.csv'
vc = 'veh_cas_labeled.csv'
Importamos librerías
# Plotting, data-handling and dashboard libraries, plus their global settings.
import plotly.tools as tls
import plotly as py
import plotly.graph_objs as go
# NOTE(review): wildcard import; it can shadow other names in this namespace.
from matplotlib import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
# Binds the get_cmap function itself; it is not called on this line.
cmap = cm.get_cmap
import seaborn as sns
# IPython magic: only valid when the file is run inside IPython/Jupyter.
%matplotlib inline
import io
# Show every column when a DataFrame is displayed (no column limit).
pd.set_option('display.max_columns', None)
from plotly.subplots import make_subplots
# NOTE(review): rebinds `go`, which was already imported from plotly.graph_objs.
import plotly.graph_objects as go
import streamlit as st
import cufflinks as cf
# Put cufflinks in offline mode and select the 'pearl' theme.
cf.go_offline()
cf.set_config_file(theme='pearl',sharing='public',offline=True)
import warnings
# Ignore FutureWarning messages for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)
Carga de los datos
# Load the labelled accidents CSV: the first column becomes the index and the
# "Date" column is parsed as datetimes. os.path.join builds the path instead
# of concatenating strings with a hard-coded '/'.
accidents = pd.read_csv(
    os.path.join(datapath, acc),
    delimiter=',',
    encoding='UTF-8-SIG',
    index_col=0,
    parse_dates=["Date"],
)
# Load the labelled vehicle/casualty CSV with the same options, minus the
# date parsing.
dfmerged = pd.read_csv(
    os.path.join(datapath, vc),
    delimiter=',',
    encoding='UTF-8-SIG',
    index_col=0,
)
# Scatter plot of casualties against vehicles. Marker size encodes
# Accident_Severity and marker colour encodes Speed_limit_2.
sns.set_context("talk", font_scale=1.1)
fig, ax = plt.subplots(figsize=(14, 8))

# Plot at most the first 1,000,000 rows.
scatter_rows = accidents.iloc[:1000000]
sns.scatterplot(
    data=scatter_rows,
    x="Number_of_Casualties",
    y="Number_of_Vehicles",
    hue="Speed_limit_2",
    size="Accident_Severity",
    sizes=(20, 500),
    alpha=0.5,
    ax=ax,
)

# Legend, tick labels, axis labels and title.
ax.legend(fontsize=12)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
ax.set_xlabel("Number_of_Casualties", fontsize=15)
ax.set_ylabel("Number_of_Vehicles", fontsize=15)
ax.set_title("Scatter plot - Numerical Variables", fontsize=18, fontweight="bold")
fig.tight_layout()

# Keep only the bottom spine (top and right are removed by default).
sns.despine(ax=ax, left=True)
plt.show()
# Share of accidents per severity level, one row per level. The two columns
# are named explicitly because pandas 2.0 changed the names produced by
# value_counts().reset_index(); relying on the implicit 'index' column would
# break the labels/values lookup below on newer pandas versions.
severity_share = (
    accidents['Accident_Severity']
    .value_counts(normalize=True)
    .rename_axis('index')
    .reset_index(name='Accident_Severity')
)
# Donut-style pie chart returned as a figure object (asFigure=True) so the
# layout can be adjusted before it is shown.
fig = severity_share.iplot(
    kind='pie',
    dimensions=(400, 400),
    labels='index',
    values='Accident_Severity',
    textinfo='percent+label',
    hole=0.4,
    color=['lightgreen', 'orange', 'red'],
    title='Accident Severity Chart',
    asFigure=True,
)
# Reposition the legend and centre the title horizontally.
fig.update_layout(
    legend=dict(yanchor="top", y=1.15, xanchor="left", x=0.85),
    title_x=0.5,
    title_y=0.92,
)
fig.show()
# Use the date as index and resample it by month to get the number of
# accidents in each month, then smooth the series with a rolling mean over a
# 12-row window (the twelve months of a year).
sns.set_style('white')
fig, ax = plt.subplots(figsize=(16, 6))

# Computed once and reused for both lines.
monthly_counts = accidents.set_index('Date').resample('M').size()
monthly_counts.plot(label='Total Month', color='grey', ax=ax)
monthly_counts.rolling(window=12).mean().plot(
    color='lightgreen', linewidth=5, label='Monthly average 12 months', ax=ax
)

ax.set_title('Accidents per Month', fontsize=18, fontweight='bold')
ax.legend(fontsize=10)
ax.set_xlabel('Year', fontsize=15)
ax.set_ylabel('Total counts\n', fontsize=15)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# Remove every spine except the bottom one.
sns.despine(ax=ax, top=True, right=True, left=True, bottom=False);
Por alguna razón los accidentes tienden a bajar a medida que van pasando los años, lo que podría indicarnos una clara mejora en los sistemas de seguridad de los automóviles.
Se puede apreciar un pico alrededor de los años 90, que probablemente se deba a que fue una época en la que las compañías de vehículos comercializaron muchos coches accesibles al usuario medio, con una gran cilindrada y unos sistemas de seguridad menos eficientes que los de hoy en día.
# Per-year totals: number of accidents plus summed vehicles and casualties.
df1 = (
    accidents.groupby(['Year'])
    .agg({
        'Accident_Index': 'count',
        'Number_of_Vehicles': 'sum',
        'Number_of_Casualties': 'sum',
    })
    .reset_index()
)

sns.set_style("white")
x = df1.Year
width = 0.5
Accidentcounts = df1['Accident_Index']
Casualtycounts = df1['Number_of_Casualties']
# Zero-based position of the year whose two bars get accent colours.
highlight_pos = 10

fig, ax = plt.subplots(figsize=(16, 6))
# Two bars per year, placed side by side around the year's tick.
bar1 = ax.bar(x - width/2, Accidentcounts, width, label='Accident counts', color='paleturquoise')
bar2 = ax.bar(x + width/2, Casualtycounts, width, label='Casualty counts', color='slategrey')
# Guarded so that data covering fewer years does not raise IndexError.
if len(bar1) > highlight_pos:
    bar1[highlight_pos].set_color('moccasin')
    bar2[highlight_pos].set_color('lightcoral')

ax.legend(fontsize=10)
ax.set_title('\nAccidents / Casualties \n per Year\n', fontsize=18, fontweight='bold')
ax.set_xlabel('\nYear', fontsize=15)
ax.set_ylabel('Total counts\n', fontsize=15)
# Fix one tick per year first so the font size below is applied to the final
# tick labels.
ax.set_xticks(x)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# Remove every spine except the bottom one.
sns.despine(ax=ax, top=True, right=True, left=True, bottom=False)
plt.show()
# NOTE(review): `cmap` is not used by this chart; the binding is kept in case
# other code reads it.
cmap = plt.cm.get_cmap('Spectral')

# Number of accidents for each value of the Month column.
df2 = accidents.groupby(['Month'])['Accident_Index'].count().reset_index()
months = [
    'January',
    'February',
    'March',
    'April',
    'May',
    'June',
    'July',
    'August',
    'September',
    'October',
    'November',
    'December',
]
# Ordered categorical so that sorting follows the order of `months` rather
# than alphabetical order. Values not listed in `months` become NaN.
df2['Month'] = pd.Categorical(df2['Month'], categories=months, ordered=True)
df2 = df2.sort_values(by='Month')

sns.set_style("white")
x = df2['Month']
y = df2['Accident_Index']
# Zero-based position of the bar drawn in the accent colour (the tenth bar).
highlight_pos = 9

fig, ax = plt.subplots(figsize=(16, 6))
bar1 = ax.bar(x, y, color='cornflowerblue', linewidth=4)
# Guarded so that data with fewer than ten bars does not raise IndexError.
if len(bar1) > highlight_pos:
    bar1[highlight_pos].set_color('tomato')

ax.set_title('Accidents per Month', fontsize=18, fontweight='bold')
ax.set_xlabel('\n Month', fontsize=15)
ax.set_ylabel('Total Count\n', fontsize=15)
# Tick font sizes are set after the bars are drawn so they apply to the
# final tick labels.
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# Remove every spine except the bottom one.
sns.despine(ax=ax, top=True, right=True, left=True, bottom=False)
plt.show()
2021-12-30 19:50:54.734 INFO matplotlib.category: Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting. 2021-12-30 19:50:54.737 INFO matplotlib.category: Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
# Number of accidents per weekday, first ranked from most to fewest.
weekday_counts = accidents.groupby(['Day_of_Week'])['Accident_Index'].count()
df3 = weekday_counts.sort_values(ascending=False).reset_index()

days = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
# Ordered categorical so the rows can be re-sorted in the order of `days`.
weekday_categories = pd.Categorical(df3['Day_of_Week'], categories=days, ordered=True)
df3['Day_of_Week'] = weekday_categories
df3 = df3.sort_values('Day_of_Week')

fig, ax = plt.subplots(figsize=(10, 5))
barlist = ax.bar(df3['Day_of_Week'], df3['Accident_Index'], color='orange')
# Accent colour on the sixth bar.
barlist[5].set_color('plum')

ax.set_title('\nAccidents per Weekday\n', fontsize=18, fontweight='bold')
ax.set_xlabel('\n Weekday', fontsize=15)
ax.set_ylabel('Total Count\n', fontsize=15)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

# Keep only the bottom spine (top and right are removed by default).
sns.despine(ax=ax, left=True)
plt.show()
2021-12-30 19:50:56.656 INFO matplotlib.category: Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting. 2021-12-30 19:50:56.659 INFO matplotlib.category: Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
# Accident counts for every (Day_of_Week, Daytime) pair.
df4 = (
    accidents.groupby(['Day_of_Week', 'Daytime'])['Accident_Index']
    .count()
    .reset_index()
)
days = [
    'Sunday',
    'Monday',
    'Tuesday',
    'Wednesday',
    'Thursday',
    'Friday',
    'Saturday',
]
# Ordered categorical built from `days` for the weekday column.
df4['Day_of_Week'] = pd.Categorical(df4['Day_of_Week'], categories=days, ordered=True)
# Wide table: one row per weekday, one column per Daytime value.
df4 = df4.pivot(index='Day_of_Week', columns='Daytime', values='Accident_Index')

fig = df4.iplot(
    kind="heatmap",
    colorscale="Reds",
    dimensions=(670, 520),
    title='Heatmap Daytime vs Weekday',
    asFigure=True,
)
# Centre the title horizontally.
fig.update_layout(title_x=0.50)
# Show the figure explicitly: with asFigure=True nothing is rendered on its
# own when the file runs as a script, and `fig` is reassigned by later code.
fig.show()
# Histogram of the Hour column (40 bins), built as a figure object so the
# title position can be adjusted before display.
hour_hist_options = dict(
    kind='histogram',
    bins=40,
    theme="white",
    title="Accidents by Time",
    dimensions=(900, 400),
    xTitle='Hour of the Day',
    yTitle='Acc Count',
    colors="darkseagreen",
    asFigure=True,
)
fig = accidents['Hour'].iplot(**hour_hist_options)
fig.update_layout(title_x=0.5, title_y=0.85)
fig.show()